library(ggplot2)
library(reshape2)
library(dplyr)
library(tidyr)
library(GGally)
library(grid)
"%&%" = function(a,b) paste(a,b,sep="")
source('/Volumes/im-lab/nas40t2/hwheeler/PrediXcan-Paper/scripts/Heather/make-figures/multiplot.R')
my.dir <- '/Volumes/im-lab/nas40t2/hwheeler/cross-tissue/'
fig.dir <- '~/GitHub/GenArch/GenArchPaper/Figures/'
exp.dir <- my.dir %&% "gtex-rnaseq/ind-tissues-RPKM/"
h2.dir <- '/Volumes/im-lab/nas40t2/Data/Annotations/heritability/'
se <- function(x) sqrt(var(x)/length(x))
h2 <- read.table('/Volumes/im-lab/nas40t2/hwheeler/PrediXcan_CV/cis.v.trans.prediction/DGN-WB.localGRM.h2.exp.2014-08-30.txt',header=T)
rawcounts <- read.table('/Volumes/im-lab/nas40t2/Data/Transcriptome/WB1K/data_used_for_eqtl_study/raw_counts.txt',header=T)
expmean<-data.frame(mean.exp=colMeans(rawcounts),se.exp=apply(rawcounts,2,se)) %>%mutate(gene=as.factor(colnames(rawcounts)))
all <- inner_join(expmean,h2,by='gene',copy=TRUE)
ngenes<-dim(all)[[1]]
a<-ggplot(all,aes(x=log10(mean.exp),y=h2)) + geom_point() + xlab(expression(paste("log"[10],"(mean expression raw counts)"))) + ylab(expression("Local h"^2)) + ggtitle("DGN-WB (" %&% ngenes %&% " genes)") + theme_bw(20)+ geom_smooth(method = "lm")
a
summary(lm(h2~mean.exp,all))
##
## Call:
## lm(formula = h2 ~ mean.exp, data = all)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.12348 -0.10470 -0.06199 0.04329 0.81936
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.235e-01 1.380e-03 89.488 < 2e-16 ***
## mean.exp -3.825e-07 1.085e-07 -3.525 0.000425 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1558 on 13623 degrees of freedom
## Multiple R-squared: 0.0009114, Adjusted R-squared: 0.000838
## F-statistic: 12.43 on 1 and 13623 DF, p-value: 0.0004246
b<-ggplot(all,aes(x=log10(se.exp),y=h2)) + geom_point() + xlab(expression(paste("log"[10],"(SE expression raw counts)"))) + ylab(expression("Local h"^2)) + ggtitle("DGN-WB (" %&% ngenes %&% " genes)") + theme_bw(20)+ geom_smooth(method = "lm")
b
summary(lm(h2~se.exp,all))
##
## Call:
## lm(formula = h2 ~ se.exp, data = all)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.12283 -0.10476 -0.06223 0.04315 0.82021
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.228e-01 1.355e-03 90.641 <2e-16 ***
## se.exp -1.017e-05 4.057e-06 -2.508 0.0122 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1559 on 13623 degrees of freedom
## Multiple R-squared: 0.0004615, Adjusted R-squared: 0.0003881
## F-statistic: 6.289 on 1 and 13623 DF, p-value: 0.01216
tiff(filename=fig.dir %&% "FigSx-h2_v_exp_rawcounts.tiff",width=960,height=480)
multiplot(a,b,cols=2)
dev.off()
## quartz_off_screen
## 2
png(filename=fig.dir %&% "FigSx-h2_v_exp_rawcounts.png",width=960,height=480)
multiplot(a,b,cols=2)
dev.off()
## quartz_off_screen
## 2
###only include genes with h2>0.1
filall<-filter(all,h2>0.1)
ngenes<-dim(filall)[[1]]
a<-ggplot(filall,aes(x=log10(mean.exp),y=h2)) + geom_point() + xlab(expression(paste("log"[10],"(mean expression raw counts)"))) + ylab(expression("Local h"^2)) + ggtitle("DGN-WB (" %&% ngenes %&% " h2 > 0.1 genes)") + theme_bw(20)+ geom_smooth(method = "lm")
a
summary(lm(h2~mean.exp,filall))
##
## Call:
## lm(formula = h2 ~ mean.exp, data = filall)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.17465 -0.12825 -0.05403 0.08148 0.66783
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.748e-01 2.409e-03 114.102 <2e-16 ***
## mean.exp -1.705e-07 2.345e-07 -0.727 0.467
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1652 on 5055 degrees of freedom
## Multiple R-squared: 0.0001046, Adjusted R-squared: -9.321e-05
## F-statistic: 0.5288 on 1 and 5055 DF, p-value: 0.4671
b<-ggplot(filall,aes(x=log10(se.exp),y=h2)) + geom_point() + xlab(expression(paste("log"[10],"(SE expression raw counts)"))) + ylab(expression("Local h"^2)) + ggtitle("DGN-WB (" %&% ngenes %&% " h2 > 0.1 genes)") + theme_bw(20)+ geom_smooth(method = "lm")
b
summary(lm(h2~se.exp,filall))
##
## Call:
## lm(formula = h2 ~ se.exp, data = filall)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.17470 -0.12809 -0.05404 0.08182 0.66813
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.743e-01 2.396e-03 114.511 <2e-16 ***
## se.exp 6.629e-07 1.259e-05 0.053 0.958
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1652 on 5055 degrees of freedom
## Multiple R-squared: 5.487e-07, Adjusted R-squared: -0.0001973
## F-statistic: 0.002774 on 1 and 5055 DF, p-value: 0.958
we do not see h2 dependence on expression level in contrast to what is reported for array based h2 estimates–see Wright et al Table 2.
TO DO: plot a) array based h2 vs. RPKM in gtex, use alkes and or fred wright’s h2 estimates b) RNAseq h2 vs RPKM in gtex
##get GTEx TW local h2
twh2 <- read.table(my.dir %&% "GTEx_Tissue-Wide_local_h2_se_geneinfo_no_description.txt",header=TRUE)
twh2 <- dplyr::rename(twh2,gene=AssociatedGeneName,ensid=EnsemblGeneID)
##get price h2 (custom-made human array containing 23,720 unique oligonucleotide probes, http://www.nature.com/nature/journal/v452/n7186/full/nature06758.html)
price <- read.table(h2.dir %&% "Alkes/h2all.txt",header=TRUE) %>% dplyr::rename(gene=gname)
head(price)
## gene h2adip h2blood h2adipcis h2adiptra h2bloodcis h2bloodtra
## 1 A2BP1 0.361 0.038 0.151 0.235 0.210 0.376
## 2 A2M 0.193 0.228 0.199 -0.006 0.005 0.223
## 3 A2ML1 0.125 0.130 0.003 0.122 0.067 0.063
## 4 A3GALT2 0.231 0.058 0.225 0.006 -0.101 0.159
## 5 A4GALT 0.176 0.322 -0.275 0.451 0.364 -0.042
## 6 A4GNT 0.629 0.265 -0.039 0.668 0.051 0.214
summary(price)
## gene h2adip h2blood h2adipcis
## A2BP1 : 1 Min. :-0.2470 Min. :-0.2550 Min. :-0.42200
## A2M : 1 1st Qu.: 0.1370 1st Qu.: 0.0510 1st Qu.:-0.03100
## A2ML1 : 1 Median : 0.2345 Median : 0.1400 Median : 0.06100
## A3GALT2: 1 Mean : 0.2511 Mean : 0.1679 Mean : 0.07464
## A4GALT : 1 3rd Qu.: 0.3480 3rd Qu.: 0.2560 3rd Qu.: 0.16500
## A4GNT : 1 Max. : 0.9640 Max. : 0.9030 Max. : 0.89100
## (Other):15072 NA's :253
## h2adiptra h2bloodcis h2bloodtra
## Min. :-0.4570 Min. :-0.39200 Min. :-0.4480
## 1st Qu.: 0.0730 1st Qu.:-0.02500 1st Qu.: 0.0030
## Median : 0.1940 Median : 0.05600 Median : 0.1080
## Mean : 0.1974 Mean : 0.07145 Mean : 0.1128
## 3rd Qu.: 0.3170 3rd Qu.: 0.14900 3rd Qu.: 0.2170
## Max. : 1.0250 Max. : 0.69000 Max. : 0.8090
## NA's :253 NA's :253
a<-inner_join(twh2,price,"gene")
dim(a)
## [1] 11360 28
ggplot(a,aes(h2.WholeBlood,h2bloodcis)) + geom_point(alpha=1/4) + geom_smooth() + theme_bw()
cor.test(a$h2.WholeBlood,a$h2bloodcis,method='s')
##
## Spearman's rank correlation rho
##
## data: a$h2.WholeBlood and a$h2bloodcis
## S = 2.0757e+11, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.1077962
cor.test(a$h2.WholeBlood,a$h2blood,method='s')
##
## Spearman's rank correlation rho
##
## data: a$h2.WholeBlood and a$h2blood
## S = 2.098e+11, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.09823263
cor.test(a$h2.WholeBlood,a$h2bloodtra,method='s')
##
## Spearman's rank correlation rho
##
## data: a$h2.WholeBlood and a$h2bloodtra
## S = 2.3193e+11, p-value = 0.7432
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.003099754
tislist <- scan(my.dir %&% "nine.spaces.tissue.list","c",sep="\n")
rpkm <- data.frame()
for(tis in tislist){
tisrpkm <- read.table(exp.dir %&% tis %&% ".meanRPKM.txt",header=TRUE)
combo <- inner_join(a,tisrpkm,by='ensid') %>% dplyr::select(ensid,meanRPKM,h2adipcis,h2bloodcis) %>% mutate(tissue=tis)
res<-cor.test(combo$h2bloodcis,log10(combo$meanRPKM),method='p')
cat(tis," RPKM v Price h2bloodcis: R=",signif(res$estimate,2)," p=",signif(res$p.value,2),"\n",sep="")
res<-cor.test(combo$h2adipcis,log10(combo$meanRPKM),method='p')
cat(tis," RPKM v Price h2adipcis: R=",signif(res$estimate,2)," p=",signif(res$p.value,2),"\n",sep="")
rpkm <- rbind(rpkm,combo)
}
## Adipose - Subcutaneous RPKM v Price h2bloodcis: R=0.11 p=0
## Adipose - Subcutaneous RPKM v Price h2adipcis: R=0.12 p=0
## Artery - Tibial RPKM v Price h2bloodcis: R=0.096 p=0
## Artery - Tibial RPKM v Price h2adipcis: R=0.12 p=0
## Heart - Left Ventricle RPKM v Price h2bloodcis: R=0.095 p=0
## Heart - Left Ventricle RPKM v Price h2adipcis: R=0.12 p=0
## Lung RPKM v Price h2bloodcis: R=0.12 p=0
## Lung RPKM v Price h2adipcis: R=0.1 p=0
## Muscle - Skeletal RPKM v Price h2bloodcis: R=0.081 p=0
## Muscle - Skeletal RPKM v Price h2adipcis: R=0.11 p=0
## Nerve - Tibial RPKM v Price h2bloodcis: R=0.1 p=0
## Nerve - Tibial RPKM v Price h2adipcis: R=0.12 p=0
## Skin - Sun Exposed (Lower leg) RPKM v Price h2bloodcis: R=0.078 p=0
## Skin - Sun Exposed (Lower leg) RPKM v Price h2adipcis: R=0.083 p=0
## Thyroid RPKM v Price h2bloodcis: R=0.11 p=0
## Thyroid RPKM v Price h2adipcis: R=0.12 p=0
## Whole Blood RPKM v Price h2bloodcis: R=0.14 p=0
## Whole Blood RPKM v Price h2adipcis: R=0.068 p=3.7e-13
ggplot(rpkm,aes(x=log10(meanRPKM),y=h2bloodcis))+geom_point(alpha=1/4)+geom_smooth(method="lm")+facet_wrap(~tissue)+ggtitle("Price blood h2 vs. GTEx mean RPKM")+theme_bw()
ggplot(rpkm,aes(x=log10(meanRPKM),y=h2adipcis))+geom_point(alpha=1/4)+geom_smooth(method="lm")+facet_wrap(~tissue)+ggtitle("Price adipose h2 vs. GTEx mean RPKM")+theme_bw()
##price h2 vs. DGN raw counts
dgnprice <- inner_join(all,price,by='gene')
ggplot(dgnprice,aes(x=log10(mean.exp),y=h2bloodcis))+ geom_point(alpha=1/2) + geom_smooth() + theme_bw()+ggtitle("Price blood h2 vs. DGN mean counts")
cor.test(log10(dgnprice$mean.exp),dgnprice$h2bloodcis,method='s')
##
## Spearman's rank correlation rho
##
## data: log10(dgnprice$mean.exp) and dgnprice$h2bloodcis
## S = 1.3242e+11, p-value = 0.2269
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.01253105
ggplot(dgnprice,aes(x=log10(mean.exp),y=h2adipcis))+ geom_point(alpha=1/2) + geom_smooth() + theme_bw()+ggtitle("Price adipose h2 vs. DGN mean counts")
cor.test(log10(dgnprice$mean.exp),dgnprice$h2adipcis,method='s')
##
## Spearman's rank correlation rho
##
## data: log10(dgnprice$mean.exp) and dgnprice$h2adipcis
## S = 1.3912e+11, p-value = 0.5721
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.00581726
##get GTEx TW local h2
twh2 <- read.table(my.dir %&% "GTEx_Tissue-Wide_local_h2_se_geneinfo_no_description.txt",header=TRUE)
twh2 <- dplyr::rename(twh2,gene=AssociatedGeneName,ensid=EnsemblGeneID)
wright <- read.table(h2.dir %&% "Fred/h2PB.txt",header=TRUE) %>% dplyr::rename(gene=gname)
head(wright)
## gene h2PB
## 1 A1BG -0.25582491
## 2 A1CF 0.05112739
## 3 A2M 0.49116590
## 4 A2ML1 0.10640690
## 5 A3GALT2 0.20743214
## 6 A4GALT 0.03834043
summary(wright)
## gene h2PB
## A1BG : 1 Min. :-0.40792
## A1CF : 1 1st Qu.: 0.01551
## A2M : 1 Median : 0.10140
## A2ML1 : 1 Mean : 0.10510
## A3GALT2: 1 3rd Qu.: 0.18750
## A4GALT : 1 Max. : 0.90467
## (Other):17786
a<-inner_join(twh2,wright,"gene")
dim(a)
## [1] 15805 23
ggplot(a,aes(h2.WholeBlood,h2PB)) + geom_point(alpha=1/4) + geom_smooth() + theme_bw()
cor.test(a$h2.WholeBlood,a$h2PB,method='s')
##
## Spearman's rank correlation rho
##
## data: a$h2.WholeBlood and a$h2PB
## S = 6.2666e+11, p-value = 2.055e-09
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.04764976
tislist <- scan(my.dir %&% "nine.spaces.tissue.list","c",sep="\n")
rpkm <- data.frame()
for(tis in tislist){
tisrpkm <- read.table(exp.dir %&% tis %&% ".meanRPKM.txt",header=TRUE)
combo <- inner_join(a,tisrpkm,by='ensid') %>% dplyr::select(ensid,meanRPKM,h2PB) %>% mutate(tissue=tis)
res<-cor.test(combo$h2PB,log10(combo$meanRPKM),method='p')
cat(tis," RPKM v Wright h2PB: R=",signif(res$estimate,2)," p=",signif(res$p.value,2),"\n",sep="")
rpkm <- rbind(rpkm,combo)
}
## Adipose - Subcutaneous RPKM v Wright h2PB: R=0.19 p=0
## Artery - Tibial RPKM v Wright h2PB: R=0.18 p=0
## Heart - Left Ventricle RPKM v Wright h2PB: R=0.17 p=0
## Lung RPKM v Wright h2PB: R=0.23 p=0
## Muscle - Skeletal RPKM v Wright h2PB: R=0.15 p=0
## Nerve - Tibial RPKM v Wright h2PB: R=0.18 p=0
## Skin - Sun Exposed (Lower leg) RPKM v Wright h2PB: R=0.15 p=0
## Thyroid RPKM v Wright h2PB: R=0.18 p=0
## Whole Blood RPKM v Wright h2PB: R=0.29 p=0
ggplot(rpkm,aes(x=log10(meanRPKM),y=h2PB))+geom_point(alpha=1/4)+geom_smooth(method="lm")+facet_wrap(~tissue)+ggtitle("Wright blood h2 vs. GTEx mean RPKM")+theme_bw()
##wright h2 vs. DGN raw counts
dgnwright <- inner_join(all,wright,by='gene')
ggplot(dgnwright,aes(x=log10(mean.exp),y=h2PB))+ geom_point(alpha=1/2) + geom_smooth() + theme_bw()+ggtitle("Wright blood h2 vs. DGN mean counts")
cor.test(log10(dgnwright$mean.exp),dgnwright$h2PB,method='s')
##
## Spearman's rank correlation rho
##
## data: log10(dgnwright$mean.exp) and dgnwright$h2PB
## S = 2.3087e+11, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.2651532
tislist <- scan(my.dir %&% 'nine.tissue.list',sep="\n",what="character")
tislist2 <- scan(my.dir %&% "nine.spaces.tissue.list","c",sep="\n")
tw <- data.frame()
for(i in 1:length(tislist)){
tisrpkm <- read.table(exp.dir %&% tislist2[i] %&% ".meanRPKM.txt",header=TRUE)
h2 <- read.table(my.dir %&% "gtex-h2-estimates/GTEx.tissue-wide.h2_" %&% tislist[i] %&% "_marginal.local_2015-03-24.txt",header=T, sep="\t") %>% select(tissue,ensid,h2)
subdata <- inner_join(h2,tisrpkm,by="ensid")
res<-cor.test(log10(subdata$meanRPKM),subdata$h2)
cat(tislist[i],"\tPearson R=",round(res$estimate,3),"\tP-value=",res$p.value,"\n")
tw <- rbind(tw,subdata)
}
## Adipose-Subcutaneous Pearson R= 0.072 P-value= 0
## Artery-Tibial Pearson R= 0.06 P-value= 4.440892e-16
## Heart-LeftVentricle Pearson R= 0.048 P-value= 6.506884e-11
## Lung Pearson R= 0 P-value= 0.9709712
## Muscle-Skeletal Pearson R= 0.042 P-value= 9.302326e-09
## Nerve-Tibial Pearson R= 0.095 P-value= 0
## Skin-SunExposed(Lowerleg) Pearson R= 0.08 P-value= 0
## Thyroid Pearson R= 0.059 P-value= 4.440892e-16
## WholeBlood Pearson R= 0.083 P-value= 0
ggplot(tw,aes(x=log10(meanRPKM),y=h2))+geom_point(alpha=0.4)+ylab(expression("GCTA h"^2))+xlab(expression(paste("log"[10],"(meanRPKM)")))+geom_smooth(method="lm") + facet_wrap(~tissue,ncol=3)+theme_bw()+ggtitle("GTEx TW local h2 vs. GTEx mean RPKM")